Skip to main content

The PDF and Images Challenge

What makes PDFs and images “unstructured”:
  • No consistent schema
  • Mixed content types (text, images, tables)
  • Varied formats (PDFs, Word docs, slides)
  • Quality issues (scans, OCR errors, formatting)
Why it matters:
  • 80% of enterprise data is unstructured
  • Most business value is locked in documents
  • Poor processing = poor retrieval = poor answers

PDF Processing: Digital vs. Scanned

Not all PDFs are created equal.

Digital PDFs (Text-Based)

# Digital PDFs: Extract text directly (fast, accurate)
import PyPDF2

def extract_digital_pdf(pdf_path: str) -> str:
    """Return the text layer of a digital (text-based) PDF.

    Each page's text is followed by a blank line, and the combined result
    has leading and trailing whitespace stripped.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Pages must be read while the file is still open.
        page_texts = [page.extract_text() + "\n\n" for page in pdf_reader.pages]

    return "".join(page_texts).strip()

# Fast and accurate for PDFs created from digital sources
# (Word exports, generated reports, etc.)

Scanned PDFs (Image-Based)

# Scanned PDFs: Require OCR (slow, error-prone)
from pdf2image import convert_from_path
import pytesseract
from PIL import Image

def extract_scanned_pdf(pdf_path: str, dpi: int = 300, lang: str = 'eng') -> str:
    """
    Extract text from scanned PDF using OCR.
    Warning: Slow (5-10 sec per page) and error-prone.

    Args:
        pdf_path: Path to the PDF file.
        dpi: Resolution used when converting pages to images.
            Higher DPI = better OCR, at the cost of time and memory.
        lang: Tesseract language code passed to the OCR engine.

    Returns:
        The OCR text of all pages. Each page is introduced by a
        "Page N:" header and pages are separated by a blank line; the
        combined text has surrounding whitespace stripped.
    """
    # Convert PDF pages to images
    images = convert_from_path(pdf_path, dpi=dpi)

    page_sections = []
    for page_number, image in enumerate(images, start=1):
        # Apply OCR to each page
        page_text = pytesseract.image_to_string(
            image,
            lang=lang,
            config='--psm 3'  # Page segmentation mode: auto
        )
        page_sections.append(f"Page {page_number}:\n{page_text}")

    # Joining once avoids repeated string concatenation in the loop.
    return "\n\n".join(page_sections).strip()

# OCR challenges:
# - Handwriting: 60-80% accuracy (unreliable)
# - Low-quality scans: Poor results
# - Tables: Often mangled
# - Processing time: 5-10 seconds per page

Combined PDF Processing Pipeline

from typing import Optional
import logging

class PDFProcessor:
    """Production-grade PDF processing with fallbacks.

    Direct text extraction is tried first (fast path); when it fails or
    yields too little text, the PDF is processed with OCR instead.
    """

    # A PDF counts as digital only if more than this many characters
    # can be extracted directly.
    MIN_DIGITAL_CHARS = 100
    # Extracted text shorter than this is flagged as a possible failure.
    MIN_USABLE_CHARS = 50

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def _try_digital_extraction(self, pdf_path: str) -> Optional[str]:
        """Return directly extracted text, or None if it fails or is too short."""
        try:
            text = extract_digital_pdf(pdf_path)
        except Exception as exc:
            # Deliberate best-effort: any extraction failure means "not
            # digital". Unlike a bare ``except:``, this does not swallow
            # KeyboardInterrupt or SystemExit.
            self.logger.debug(
                "Digital extraction failed for %s: %s", pdf_path, exc
            )
            return None

        if len(text.strip()) > self.MIN_DIGITAL_CHARS:
            return text
        return None

    def is_digital_pdf(self, pdf_path: str) -> bool:
        """Heuristic: Check if PDF has extractable text.

        Returns True when direct extraction succeeds and yields more than
        ``MIN_DIGITAL_CHARS`` characters, False otherwise (including when
        extraction raises).
        """
        return self._try_digital_extraction(pdf_path) is not None

    def process_pdf(self, pdf_path: str) -> dict:
        """
        Process PDF with appropriate method.

        Returns:
            {text: str, method: str, confidence: float, warnings: list[str]}
            ``method`` is "digital" or "ocr".
        """
        result = {
            "text": "",
            "method": "unknown",
            "confidence": 0.0,
            "warnings": []
        }

        # Try digital extraction first (fast path). The text is extracted
        # once and reused rather than extracted again after the check.
        digital_text = self._try_digital_extraction(pdf_path)
        if digital_text is not None:
            result["text"] = digital_text
            result["method"] = "digital"
            result["confidence"] = 0.95
            self.logger.info("Digital extraction successful: %s", pdf_path)
        else:
            # Fall back to OCR (slow path)
            self.logger.warning("Using OCR for: %s", pdf_path)
            result["text"] = extract_scanned_pdf(pdf_path)
            result["method"] = "ocr"
            result["confidence"] = 0.70  # OCR less reliable
            result["warnings"].append("OCR used - may contain errors")

        # Quality checks
        if len(result["text"]) < self.MIN_USABLE_CHARS:
            result["warnings"].append("Very short text extracted - possible failure")
            result["confidence"] *= 0.5

        return result

# Usage: the same processor instance is reused for both documents
processor = PDFProcessor()

# Digital PDF (fast): text is extracted directly, no OCR involved
result = processor.process_pdf("digital_report.pdf")
# >>> method: 'digital', confidence: 0.95, time: ~100ms

# Scanned PDF (slow): the OCR fallback runs; note `result` is overwritten
result = processor.process_pdf("scanned_form.pdf")
# >>> method: 'ocr', confidence: 0.70, time: ~30 seconds

Table Extraction: The Hard Problem

Tables are notoriously difficult to extract, both with OCR and from digital PDFs. Why tables are hard:
  • OCR sees tables as disconnected text blocks
  • Column alignment information is lost
  • Multi-line cells get fragmented
  • Headers vs. data distinction unclear
Solutions:

1. Specialized Table Extractors

# Using camelot for better table extraction
import camelot

def extract_tables_from_pdf(pdf_path: str) -> list:
    """
    Pull every table out of a PDF with camelot.

    Works best on digital PDFs with clear table borders. Returns one dict
    per detected table with the keys "page", "dataframe", "text" and
    "markdown".
    """
    def _describe(table) -> dict:
        # camelot exposes each table's contents as a pandas DataFrame
        frame = table.df
        return {
            "page": table.page,
            "dataframe": frame,
            "text": frame.to_string(),  # For RAG embedding
            "markdown": frame.to_markdown(),  # For LLM consumption
        }

    # 'lattice' targets bordered tables ('stream' is for unbordered ones)
    detected_tables = camelot.read_pdf(pdf_path, pages='all', flavor='lattice')

    return [_describe(table) for table in detected_tables]

# Example usage
# Each entry is a dict with 'page', 'dataframe', 'text' and 'markdown' keys
tables = extract_tables_from_pdf("financial_report.pdf")
for t in tables:
    print(f"Found table on page {t['page']}")
    print(t['markdown'])
    # Store markdown in RAG system for better LLM understanding

2. Vision-Language Models for Tables

Modern approach: Use multimodal models to “see” the table.
# Using GPT-4 Vision to understand table structure
import base64

def extract_table_with_vision(image_path: str) -> str:
    """
    Use vision model to extract and structure table.
    More robust than OCR for complex layouts.

    Args:
        image_path: Path to an image file containing the table.

    Returns:
        The model's reply text, requested as a markdown table.
    """
    import mimetypes  # function-scope import keeps this snippet self-contained

    # Label the payload with the file's actual image type instead of always
    # claiming JPEG; fall back to JPEG when the type cannot be guessed.
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Encode image
    with open(image_path, 'rb') as f:
        image_data = base64.b64encode(f.read()).decode()

    response = openai.chat.completions.create(
        model="gpt-4o",  # Vision-capable model
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Extract this table into markdown format. Preserve all data and structure."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_data}"
                        }
                    }
                ]
            }
        ],
        max_tokens=1000
    )

    return response.choices[0].message.content

# This works remarkably well even for:
# - Hand-drawn tables
# - Complex multi-level headers
# - Tables with merged cells
# - Tables embedded in larger documents
Production recommendation:
  1. Try specialized extractor (camelot) first - fast and accurate for clean tables
  2. Fall back to vision model for complex/messy tables - slower but more robust
  3. Store both raw table and structured version in metadata

Image Integration in RAG

Should you include images in your RAG pipeline? Depends on the use case.

When to Extract Text from Images

# Image contains primarily text (signage, screenshots, diagrams with labels)
from PIL import Image
import pytesseract

def extract_image_text(image_path: str) -> str:
    """OCR text from image.

    Args:
        image_path: Path to the image file.

    Returns:
        The recognized text with surrounding whitespace stripped.
    """
    # The context manager closes the underlying file handle, which the
    # plain Image.open() call left open.
    with Image.open(image_path) as image:
        # Preprocessing can improve OCR
        # - Convert to grayscale
        # - Increase contrast
        # - Denoise
        grayscale = image.convert('L')  # Grayscale

        text = pytesseract.image_to_string(grayscale)

    return text.strip()

When to Caption/Describe Images

# Image contains visual information (charts, photos, diagrams)
def generate_image_caption(image_path: str) -> str:
    """
    Generate descriptive caption using vision model.
    Better for images where visual information matters.

    Args:
        image_path: Path to the image file to describe.

    Returns:
        The model's reply text describing the image.
    """
    import mimetypes  # function-scope import keeps this snippet self-contained

    # Label the payload with the file's actual image type (e.g. PNG) instead
    # of always claiming JPEG; fall back to JPEG when it cannot be guessed.
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    with open(image_path, 'rb') as f:
        image_data = base64.b64encode(f.read()).decode()

    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in detail. Focus on key information someone would need to understand its content and context."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_data}"
                        }
                    }
                ]
            }
        ],
        max_tokens=300
    )

    return response.choices[0].message.content

# Example output for a bar chart:
# "A bar chart showing quarterly revenue from Q1 to Q4 2024. 
#  Q1: $2.3M, Q2: $2.8M, Q3: $3.1M, Q4: $3.5M. 
#  Clear upward trend with 52% YoY growth."

Decision Framework: Text vs. Caption

| Image Type | Approach | Reason |
|---|---|---|
| Screenshots of code/logs | OCR text extraction | Text is primary content |
| Charts/graphs | Caption with data | Visual info + specific values |
| Diagrams with labels | Both (OCR + caption) | Labels + structural understanding |
| Photos of products | Caption | Visual features matter |
| Scanned text documents | OCR | Text is the content |
| Infographics | Caption | Mix of visual + text |

Handling Long Documents

Long documents (100+ pages) present unique challenges:
  • Can’t fit entire document in LLM context window
  • Need to aggregate information across sections
  • Must maintain document-level context

Pattern 1: Constant-Output Tasks

Use case: Finding specific information (needle in haystack)
def needle_in_haystack_search(
    document_chunks: list[str],
    query: str
) -> str:
    """
    Answer a constant-output question (one specific fact).

    Retrieves the five chunks most relevant to the query and asks the LLM
    to answer from that context only.
    """
    # Plain RAG is enough here: retrieve, then generate
    top_chunks = retriever.search(query, chunks=document_chunks, top_k=5)
    context = "\n\n".join(top_chunks)

    return llm.generate(f"Context: {context}\n\nQuery: {query}")

# Example: "What is the return policy?" (one specific answer)

Pattern 2: Variable-Output Tasks

Use case: Summarization, analysis (output scales with document length)
def hierarchical_summary(
    document_chunks: list[str],
    summary_type: str = "comprehensive"
) -> str:
    """
    Summarize a long document (variable-output task) with map-reduce.

    Every chunk is summarized on its own, then the chunk summaries are
    merged into a single final summary of the requested type.
    """
    # MAP phase: one short summary per chunk
    section_summaries = [
        llm.generate(
            f"Summarize this section concisely:\n\n{chunk}",
            max_tokens=150
        )
        for chunk in document_chunks
    ]

    # REDUCE phase: fold all section summaries into the final summary
    combined = "\n\n".join(section_summaries)
    return llm.generate(
        f"Create a {summary_type} summary from these section summaries:\n\n{combined}",
        max_tokens=500
    )

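# This pattern scales to long documents, but the REDUCE prompt still contains
# every chunk summary; for very large documents, apply the reduce step
# recursively (summarize groups of summaries) so it fits the context window.
# Cost: O(num_chunks) API calls, but manageable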

A Complete Unstructured Data Pipeline

from pathlib import Path
from typing import List, Dict
import mimetypes

class UnstructuredDataPipeline:
    """Complete pipeline for processing varied document types.

    Files are routed by MIME type (guessed from the file name) to a handler
    method. Every handler returns a dict containing at least ``text``,
    ``method``, ``confidence`` and ``warnings``.
    """

    # WordprocessingML namespace prefix of the tags in word/document.xml.
    _WORD_NS = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

    def __init__(self):
        self.pdf_processor = PDFProcessor()
        # MIME type -> handler. Each handler named here is defined below, so
        # constructing the pipeline cannot fail on a missing method.
        self.supported_types = {
            'application/pdf': self.process_pdf,
            'image/jpeg': self.process_image,
            'image/png': self.process_image,
            'text/plain': self.process_text,
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': self.process_docx
        }

    def process_document(self, file_path: str) -> Dict:
        """Route document to appropriate processor.

        Raises:
            ValueError: If the guessed MIME type has no registered handler
                (including when no type can be guessed at all).
        """
        mime_type, _ = mimetypes.guess_type(file_path)

        processor = self.supported_types.get(mime_type)
        if processor is None:
            raise ValueError(f"Unsupported file type: {mime_type}")

        return processor(file_path)

    def process_pdf(self, path: str) -> Dict:
        """Process PDF with quality checks."""
        result = self.pdf_processor.process_pdf(path)

        # Extract tables separately if present
        tables = extract_tables_from_pdf(path)
        if tables:
            result['tables'] = tables
            result['warnings'].append(f"Found {len(tables)} tables")

        return result

    def process_image(self, path: str) -> Dict:
        """Process image by running both OCR and captioning."""
        # Heuristic: If image is mostly text (screenshot), OCR
        # If image is visual (photo, chart), caption
        text = extract_image_text(path)
        caption = generate_image_caption(path)

        # Combine both for rich representation
        return {
            "text": text,
            "caption": caption,
            "method": "image_processing",
            "confidence": 0.85,
            "warnings": []  # same result shape as the other handlers
        }

    @staticmethod
    def _text_result(text: str, method: str, confidence: float) -> Dict:
        """Build a result dict for directly-read text, flagging empty content."""
        warnings = []
        if not text:
            warnings.append("No text extracted - file appears to be empty")
            confidence = 0.0
        return {
            "text": text,
            "method": method,
            "confidence": confidence,
            "warnings": warnings
        }

    def process_text(self, path: str) -> Dict:
        """Read a plain-text file.

        The file is decoded as UTF-8; undecodable bytes are replaced rather
        than raising, so one bad byte does not lose the whole document.
        """
        with open(path, encoding="utf-8", errors="replace") as handle:
            text = handle.read().strip()
        return self._text_result(text, method="text", confidence=1.0)

    def process_docx(self, path: str) -> Dict:
        """Extract paragraph text from a .docx file using only the stdlib.

        Reads ``word/document.xml`` from the archive and joins the text runs
        of each paragraph; non-empty paragraphs are separated by a blank
        line. Only text runs in the main document part are read.
        """
        # Function-scope imports: only needed when a .docx is processed.
        import zipfile
        import xml.etree.ElementTree as ET

        with zipfile.ZipFile(path) as archive:
            document_xml = archive.read("word/document.xml")

        # NOTE(review): this parses file content with the stdlib XML parser;
        # use a hardened parser if documents come from untrusted sources.
        root = ET.fromstring(document_xml)

        paragraph_tag = f"{self._WORD_NS}p"
        text_tag = f"{self._WORD_NS}t"
        paragraphs = []
        for paragraph in root.iter(paragraph_tag):
            content = "".join(node.text or "" for node in paragraph.iter(text_tag))
            if content.strip():
                paragraphs.append(content)

        text = "\n\n".join(paragraphs).strip()
        # Slightly below plain text: layout and non-paragraph content is lost.
        return self._text_result(text, method="docx", confidence=0.95)

    def process_batch(
        self,
        directory: str,
        file_pattern: str = "*.*"
    ) -> List[Dict]:
        """Process all documents in directory.

        Files that fail (unsupported type or processing error) are logged
        and skipped; each successful result gets a ``source`` key holding
        the file path.
        """
        results = []

        for file_path in Path(directory).glob(file_pattern):
            # The pattern can also match directories (e.g. "v1.0/").
            if not file_path.is_file():
                continue
            try:
                result = self.process_document(str(file_path))
            except Exception as e:
                # Best-effort batch: one bad file must not stop the run.
                logging.error("Failed to process %s: %s", file_path, e)
                continue
            result['source'] = str(file_path)
            results.append(result)

        return results

# Production usage
pipeline = UnstructuredDataPipeline()

# Process entire document corpus
results = pipeline.process_batch("./documents/")

# Store in RAG system
for doc in results:
    # Quality threshold. Inclusive on purpose: OCR results carry a
    # confidence of exactly 0.70, so a strict `>` would drop every
    # OCR-processed document.
    if doc['confidence'] >= 0.7:
        # NOTE(review): image results also carry a 'caption' that is not
        # stored here - confirm whether it should be embedded as well.
        store_in_vector_db(
            content=doc['text'],
            metadata={
                'source': doc['source'],
                'method': doc['method'],
                'confidence': doc['confidence'],
                'warnings': doc.get('warnings', [])
            }
        )

Practical Exercise (20 min)

Process a mixed document corpus:
# Your task:
# 1. Process the text document at 'Assets/paul_graham_essay.txt' as 'text/plain'
# 2. Apply your preferred chunking strategy from Lesson 2.3 and store chunk metadata
# 3. (Optional) If you have one image and one PDF available, run image captioning/OCR and PDF extraction for comparison
# 4. Measure processing time and quality signals (e.g., word counts, confidence)
# 5. Identify any quality risks (e.g., overly short chunks, noisy OCR)

# Expected insights:
# - Text/plain path is straightforward and high-confidence
# - Chunking choice significantly affects later retrieval quality
# - Vision/OCR add latency and uncertainty when used